Brian S. Evans, Ph.D.
Migratory Bird Center
Smithsonian Conservation Biology Institute
# Load libraries:
library(tidyverse)
The function paste0 is used to paste two string values together. For example, we can paste the values 'hello' and 'World' together as follows. The resultant object is a nice looking camel case value.
# The paste0 function:
paste0('hello', 'World')## [1] "helloWorld"
We will use the function paste0 to create an easy-to-read URL:
gitUrl <-
'https://raw.githubusercontent.com/bsevansunc/'
courseData <-
'smsc_data_science/master/data/'
paste0(
gitUrl,
courseData,
'birdHabits.csv')## [1] "https://raw.githubusercontent.com/bsevansunc/smsc_data_science/master/data/birdHabits.csv"
# Read in the data:
birdCounts <-
read_csv(
paste0(
gitUrl,
courseData,
'bird_rawCounts.csv'))# Read in the data:
birdHabits <-
read_csv(
paste0(
gitUrl,
courseData,
'birdHabits.csv'))For the iris dataset, I think it’s best to do some familiar cleaning steps:
# Clean up iris for analysis (as_tibble() replaces the deprecated tbl_df()):
irisTbl <- as_tibble(iris)
names(irisTbl) <-
c('sepalLength',
'sepalWidth',
'petalLength',
'petalWidth',
'species')Functions are a type of R object that consists of commands that can be used to execute complex or repetitive tasks.
Functions take the form:
functionName <-
function(functionTarget) {
functionBody
}# First function:
# addOneFun: add one to every element of x and return the result.
addOneFun <- function(x) {
  plusOne <- x + 1
  plusOne
}
# Testing the function on a numeric value:
42+1
addOneFun(42)# First function:
# addOneFun: add one to every element of x and return the result.
addOneFun <- function(x) {
  plusOne <- x + 1
  plusOne
}
# Testing the function on a vector of numeric values:
v <-
c(1, 1, 2, 3, 5)
v + 1
Functions can simplify writing queries!
# Explore birdCounts data:
str(birdCounts)
head(birdCounts)
# Matrix notation query:
birdCounts[birdCounts$species == 'grca', ]
Functions can simplify writing queries!
# Query by species function:
# speciesSubset: return the rows of birdCounts whose species equals spp.
speciesSubset <- function(spp) {
  isSpp <- birdCounts$species == spp
  birdCounts[isSpp, ]
}
# Test function:
birdCounts[birdCounts$species == 'grca', ]
speciesSubset('grca')# Query by species function, generalized:
# speciesSubset (generalized): return the rows of dfIn whose species column
# equals spp.
speciesSubset <- function(dfIn, spp) {
  keepRow <- dfIn$species == spp
  dfIn[keepRow, ]
}
# Test function, birdCounts:
birdCounts[birdCounts$species == 'grca', ]
speciesSubset(birdCounts, 'grca')
# Test function, birdHabits:
birdHabits[birdHabits$species == 'grca', ]
speciesSubset(birdHabits, 'grca')In many, but not all situations.
# Subset to catbirds using $ and matrix notation:
birdHabits[birdHabits$species == 'grca', ]
birdHabits[birdHabits[,'species'] == 'grca',]# Very generalized query:
# query: return the rows of dfIn where the column named by variable equals
# condition.
query <- function(dfIn, variable, condition) {
  columnValues <- dfIn[, variable]
  matchesCondition <- columnValues == condition
  dfIn[matchesCondition, ]
}
# Test query:
birdHabits[birdHabits$species == 'grca', ]
birdHabits[birdHabits[,'species'] == 'grca',]
query(birdHabits, 'species', 'grca')birdHabits data frame to just ground foraging birds.
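Write a function to calculate the total number of birds of a given species in the birdCounts data frame (i.e., the sum of count for a data frame subset by species).
Write a function to subset the birdHabits data frame to just ground foraging birds.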
head(birdHabits)
birdHabits[birdHabits$foraging == 'ground',]
# foragingSubset: return the rows of birdHabits whose foraging value equals
# foragingValue.
foragingSubset <- function(foragingValue) {
  hasForagingValue <- birdHabits$foraging == foragingValue
  birdHabits[hasForagingValue, ]
}
# foragingSubsetGeneral: return the rows of dfIn whose foraging column equals
# foragingValue.
foragingSubsetGeneral <- function(dfIn, foragingValue) {
  hasForagingValue <- dfIn$foraging == foragingValue
  dfIn[hasForagingValue, ]
}
foragingSubset('ground')
foragingSubsetGeneral(birdHabits, 'ground')birdCounts data frame(i.e., the sum of count for a data frame subset by species)
head(birdCounts)
birdCounts[birdCounts$species == 'grca',]
birdCounts[birdCounts$species == 'grca',]$count
sum(birdCounts[birdCounts$species == 'grca',]$count)
# speciesN: sum of the count column over the rows of dfIn whose species
# equals spp.
speciesN <- function(dfIn, spp) {
  sppRecords <- dfIn[dfIn$species == spp, ]
  sum(sppRecords$count)
}
speciesN(birdCounts, 'grca')# Query function, mean count:
# meanSpeciesCounts: mean number of birds of species spp per site, using the
# birdCounts data frame.
meanSpeciesCounts <- function(spp) {
  # Total number of birds observed for the species of interest:
  sppCounts <- birdCounts[birdCounts$species == spp, ]$count
  totalBirds <- sum(sppCounts)
  # Number of unique site values across all of birdCounts:
  siteCount <- length(unique(birdCounts$site))
  # Mean number of birds per site:
  totalBirds / siteCount
}
# What is the average number of observed catbirds?
meanSpeciesCounts('grca')# Query by species function, generalized:
# meanSpeciesCounts (generalized): mean number of birds of species spp per
# site in dfIn. Relies on speciesSubset(dfIn, spp) for the species filter.
meanSpeciesCounts <- function(dfIn, spp) {
  # Total number of birds observed for the species of interest:
  sppRecords <- speciesSubset(dfIn, spp)
  totalBirds <- sum(sppRecords$count)
  # Number of unique site values in dfIn:
  siteCount <- length(unique(dfIn$site))
  # Mean number of birds per site:
  totalBirds / siteCount
}
# What is the average number of observed catbirds?
meanSpeciesCounts(birdCounts, 'grca')birdHabits data frame, write a function to count the number of species in a given diet and foraging guild.
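Write a function to calculate the standard error of a numeric vector (hint: the function for standard deviation is sd and the function for square root is sqrt):
Using the birdHabits data frame, write a function to count the number of species in a given diet and foraging guild.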
# dietForagingSppCount: number of unique species in birdHabits that belong to
# both the diet guild dietValue and the foraging guild foragingValue.
dietForagingSppCount <- function(dietValue, foragingValue) {
  # Subset to the records that match both guilds:
  dataSubset <-
    birdHabits[birdHabits$diet == dietValue &
                 birdHabits$foraging == foragingValue, ]
  # Count the distinct species in that subset:
  length(unique(dataSubset$species))
}
dietForagingSppCount('omnivore', 'ground')sd and the function for square root is sqrt):
# se: standard error of x, computed as the standard deviation of x divided by
# the square root of the number of values in x.
se <-
function(x) {
sd(x) / sqrt(length(x))
}Why would you use for loops?
# Filter irisTbl to setosa:
irisTbl[irisTbl$species == 'setosa', ]
# Extract the petalLength field (column):
irisTbl[irisTbl$species == 'setosa', ]$petalLength
# Calculate the mean of petal lengths:
mean(irisTbl[irisTbl$species == 'setosa', ]$petalLength)Calculate the mean petal length of each of the Iris species using matrix notation and a custom function.
Calculate the mean petal length of each of the Iris species using matrix notation (as above) and a custom function.
# Mean petal lengths, matrix notation:
mean(
irisTbl[irisTbl$species == 'setosa', ]$petalLength)
mean(
irisTbl[irisTbl$species == 'versicolor', ]$petalLength)
mean(
irisTbl[irisTbl$species == 'virginica', ]$petalLength)
# Mean petal lengths, function method:
# meanPetalFun: mean petalLength of the irisTbl rows whose species equals spp.
meanPetalFun <- function(spp) {
  sppPetalLengths <- irisTbl[irisTbl$species == spp, ]$petalLength
  mean(sppPetalLengths)
}
meanPetalFun('setosa')
meanPetalFun('versicolor')
meanPetalFun('virginica')
Construct a vector, v using a set of five numbers.
# Generate vector v:
v <-
c(1, 1, 2, 3, 5)
v
Modify the values in vector v by adding one to each value. This might be written mathematically as:
Writing proper for loops requires following these three steps:
Recall that value v[i] is equal to the value at position i in vector v. Let’s take a look at the value of v at position 3:
# Explore vector v using indexing:
i <- 3
v[i]
v[3]
v[3] == v[i]
Recall that value v[i] is equal to the value at position i in vector v. Let’s take a look at the value of v at position 3:
# Add 1 to the value of v at position three:
v[3] + 1
v[i] + 1ALWAYS specify an object to store your output!
Vector objects are defined as:
# Define a vector for output:
vNew <-
vector('numeric', length = length(v))
str(vNew)ALWAYS specify an object to store your output!
# Explore filling values of vNew by index:
i <- 3
v[i]
vNew[i] <- v[i] + 1
vNew[i]
v[i] + 1 == vNew[i]The sequence can be defined as:
v
1:5
1:length(v)
seq_along(v)
# Example for loop sequence statements:
# for(i in 1:length(v))
# for(i in seq_along(v))The for loop body describes what will happen at each iteration of the loop. For example:
i <- 3
vNew[i] <- v[i] + 1# First for loop:
# Output: a numeric vector with one slot for each element of v.
vNew <- numeric(length(v))
# Sequence and body: fill position i of vNew with the value of v at
# position i, plus one.
for (i in seq_along(v)) {
  vNew[i] <- v[i] + 1
}
# Explore first for loop output:
vNew
v
vNew == vSplit-Apply-Combine
# Mean petal lengths of Iris species without a for loop:
mean(irisTbl[irisTbl$species == 'setosa', ]$petalLength)
mean(irisTbl[irisTbl$species == 'versicolor', ]$petalLength)
mean(irisTbl[irisTbl$species == 'virginica', ]$petalLength)Split-Apply-Combine
Start by creating a vector of species:
# Make a vector of species to loop across:
irisSpecies <- levels(irisTbl$species)
irisSpeciesSplit-Apply-Combine
Create an empty vector to store our output:
# For loop output statement:
petalLengths <-
vector('numeric',length = length(irisSpecies))
petalLengthsSplit-Apply-Combine
Split: The for loop body starts with splitting the data:
# Exploring the iris data, subsetting by species:
i <- 3
irisSpecies[i]
irisTbl[irisTbl$species == irisSpecies[i], ]
# Split:
iris_sppSubset <-
irisTbl[irisTbl$species == irisSpecies[i], ]Split-Apply-Combine
Apply: Modification of the data:
# Calculate mean petal length of each subset:
mean(iris_sppSubset$petalLength)Split-Apply-Combine
# Make a vector of species to loop across (the levels of the species
# column, so each species name appears once):
irisSpecies <-
levels(irisTbl$species)
# For loop output statement (one numeric slot per species):
petalLengths <-
vector('numeric',length = length(irisSpecies))
# For loop (i runs over the positions of irisSpecies):
for(i in seq_along(irisSpecies)) {
# Split (rows of irisTbl for the species at position i):
iris_sppSubset <- irisTbl[irisTbl$species == irisSpecies[i],]
# Apply (store the mean petal length of that subset at position i):
petalLengths[i] <- mean(iris_sppSubset$petalLength)
}Split-Apply-Combine
Combine: Combining the for loop output
# Make a tibble data frame of the for loop output:
petalLengthFrame <-
tibble(species = irisSpecies, count = petalLengths)
petalLengthFrame
Use a for loop and the birdHabits data frame to calculate the number of species in each diet guild.
Use a for loop and the birdHabits data frame to calculate the number of species in each diet guild.
# Unique diet guilds to loop across:
diets <- unique(birdHabits$diet)
# Output: one numeric slot per diet guild:
outVector <- numeric(length(diets))
for (i in seq_along(diets)) {
  # Split: records in birdHabits for the diet guild at position i:
  dietSubset <- birdHabits[birdHabits$diet == diets[i], ]
  # Apply: number of records in that guild:
  outVector[i] <- nrow(dietSubset)
}
# Combine:
tibble(
diet = diets,
nSpecies = outVector)For loops can be used to explore data objects with common features.
How many omnivorous birds were observed at each site?
# Explore the bird count data:
head(birdCounts)
str(birdCounts)
# Explore the bird trait data:
head(birdHabits)
str(birdHabits)How many omnivorous birds were observed at each site?
Get a vector of the omnivorous species from the birdHabits data frame:
# Extract vector of omnivorous species:
omnivores <-
birdHabits[birdHabits$diet == 'omnivore',]$speciesHow many omnivorous birds were observed at each site?
Split the data into individual sites.
# Generate a vector of unique sites:
sites <-
unique(birdCounts$site)
# Site at position i:
i <- 3
sites[i]
# Subset data:
birdCounts_siteSubset <-
birdCounts[birdCounts$site == sites[i],]
birdCounts_siteSubsetHow many omnivorous birds were observed at each site?
Split: Use %in% to extract only records associated with omnivores and sum the count field.
# Just a vector of omnivore counts:
countVector <-
birdCounts_siteSubset[birdCounts_siteSubset$species %in%
omnivores,]$countHow many omnivorous birds were observed at each site?
Apply: Sum the count vector.
# Get total number of omnivores at the site:
nOmnivores <- sum(countVector)How many omnivorous birds were observed at each site?
Combine: Values combined using the vector method
# Unique sites to loop across:
sites <- unique(birdCounts$site)
# Output: one numeric slot per site:
outVector <- numeric(length(sites))
for (i in seq_along(sites)) {
  # Split: records for the site at position i:
  birdCounts_siteSubset <- birdCounts[birdCounts$site == sites[i], ]
  # Keep only the counts of species listed in omnivores:
  countVector <-
    birdCounts_siteSubset[
      birdCounts_siteSubset$species %in% omnivores, ]$count
  # Apply: total number of omnivores observed at the site:
  outVector[i] <- sum(countVector)
}
# Combine:
tibble(
site = sites,
nOmnivores = outVector)How many omnivorous birds were observed at each site?
Combine: Values combined using the list method
# Unique sites to loop across:
sites <- unique(birdCounts$site)
# Output: a list with one slot per site (sites is already unique):
outList <- vector('list', length = length(sites))
for (i in seq_along(sites)) {
  # Split: records for the site at position i:
  birdCounts_siteSubset <- birdCounts[birdCounts$site == sites[i], ]
  # Keep only the counts of species listed in omnivores:
  countVector <-
    birdCounts_siteSubset[
      birdCounts_siteSubset$species %in% omnivores, ]$count
  # Apply: store a one-row tibble for the site. tibble() replaces the
  # deprecated data_frame():
  outList[[i]] <-
    tibble(
      site = sites[i],
      nOmnivores = sum(countVector))
}
# Combine:
bind_rows(outList)For loop to generate a vector of numbers based on some mathematical function. For example:
\[n_t = 2(n_{t-1})\]
For loops can be used to generate a vector of numbers based on some mathematical function. For example:
\[n_t = 2(n_{t-1})\]
# For loop output:
n <- vector('numeric', length = 5)
n
# Set the seed value:
n[1] <- 10
nFor loop to generate a vector of numbers based on some mathematical function. For example:
\[n_t = 2(n_{t-1})\]
# For loop sequence:
# for(i in 2:length(n))For loop to generate a vector of numbers based on some mathematical function. For example:
\[n_t = 2(n_{t-1})\]
Body: For each iteration (example, position 2):
# Exploring the construction of the for loop body:
i <- 2
n[i]
n[i-1]
n[i] <- 2*n[i-1]
nFor loop to generate a vector of numbers based on some mathematical function. For example:
\[n_t = 2(n_{t-1})\]
# Output:
n <- vector('numeric', length = 5)
# Seed:
n[1] <- 10
# For loop: each value is twice the value at the previous position,
# n_t = 2(n_{t-1}):
for (i in 2:length(n)) {
  n[i] <- 2 * n[i - 1]
}
One of my favorite for loops was created by Leonardo Bonacci (Fibonacci). He created the first known population model, from which the famous Fibonacci number series was created. He described a population (N) of rabbits at time t as the sum of the population at the previous time step plus the time step before that:
\[N_t = N_{t-1} + N_{t-2}\]